{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Time Transforms\n",
    "\n",
    "Run this notebook in Vertex Workbench. In this notebook, we will use extracts from the timestamp\n",
    "as input features."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Parts of time\n",
    "\n",
    "Use extracted value from the timestamp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Query complete after 0.00s: 100%|██████████| 3/3 [00:00<00:00, 1232.41query/s]                        \n",
      "Downloading: 100%|██████████| 5/5 [00:01<00:00,  4.88rows/s]\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ontime</th>\n",
       "      <th>dep_delay</th>\n",
       "      <th>taxi_out</th>\n",
       "      <th>distance</th>\n",
       "      <th>origin</th>\n",
       "      <th>dest</th>\n",
       "      <th>dep_hour</th>\n",
       "      <th>dep_day</th>\n",
       "      <th>is_eval_day</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>ontime</td>\n",
       "      <td>-5.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>399.0</td>\n",
       "      <td>ANC</td>\n",
       "      <td>BET</td>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>late</td>\n",
       "      <td>33.0</td>\n",
       "      <td>13.0</td>\n",
       "      <td>1046.0</td>\n",
       "      <td>FLL</td>\n",
       "      <td>SJU</td>\n",
       "      <td>14</td>\n",
       "      <td>7</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>ontime</td>\n",
       "      <td>-3.0</td>\n",
       "      <td>8.0</td>\n",
       "      <td>95.0</td>\n",
       "      <td>SIT</td>\n",
       "      <td>JNU</td>\n",
       "      <td>13</td>\n",
       "      <td>5</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ontime</td>\n",
       "      <td>5.0</td>\n",
       "      <td>9.0</td>\n",
       "      <td>201.0</td>\n",
       "      <td>LIH</td>\n",
       "      <td>OGG</td>\n",
       "      <td>22</td>\n",
       "      <td>6</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>ontime</td>\n",
       "      <td>-4.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>204.0</td>\n",
       "      <td>BRW</td>\n",
       "      <td>SCC</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ontime  dep_delay  taxi_out  distance origin dest  dep_hour  dep_day  \\\n",
       "0  ontime       -5.0      10.0     399.0    ANC  BET         3        5   \n",
       "1    late       33.0      13.0    1046.0    FLL  SJU        14        7   \n",
       "2  ontime       -3.0       8.0      95.0    SIT  JNU        13        5   \n",
       "3  ontime        5.0       9.0     201.0    LIH  OGG        22        6   \n",
       "4  ontime       -4.0       5.0     204.0    BRW  SCC         4        4   \n",
       "\n",
       "   is_eval_day  \n",
       "0         True  \n",
       "1        False  \n",
       "2        False  \n",
       "3         True  \n",
       "4        False  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%bigquery\n",
    "SELECT\n",
    "  IF(arr_delay < 15, 'ontime', 'late') AS ontime,\n",
    "  dep_delay,\n",
    "  taxi_out,\n",
    "  distance,\n",
    "  origin,\n",
    "  dest,\n",
    "  EXTRACT(hour FROM dep_time) AS dep_hour,\n",
    "  EXTRACT(dayofweek FROM dep_time) AS dep_day,\n",
    "  IF(is_train_day = 'True', False, True) AS is_eval_day\n",
    "FROM dsongcp.flights_tzcorr f\n",
    "JOIN dsongcp.trainday t\n",
    "ON f.FL_DATE = t.FL_DATE\n",
    "WHERE\n",
    "  f.CANCELLED = False AND \n",
    "  f.DIVERTED = False\n",
    "LIMIT 5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Query complete after 0.00s: 100%|██████████| 3/3 [00:00<00:00, 1603.53query/s]                        \n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: []\n",
       "Index: []"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%bigquery\n",
    "CREATE OR REPLACE MODEL dsongcp.arr_delay_airports_timefeatures\n",
    "TRANSFORM(\n",
    "   * EXCEPT(dep_time),\n",
    "   EXTRACT(hour FROM dep_time) AS dep_hour,\n",
    "   EXTRACT(dayofweek FROM dep_time) AS dep_day\n",
    ")\n",
    "OPTIONS(input_label_cols=['ontime'], \n",
    "        model_type='boosted_tree_classifier',\n",
    "        data_split_method='custom',\n",
    "        data_split_col='is_eval_day',\n",
    "        l2_reg=2.5,\n",
    "        max_tree_depth=10)\n",
    "\n",
    "AS\n",
    "\n",
    "SELECT\n",
    "  IF(arr_delay < 15, 'ontime', 'late') AS ontime,\n",
    "  dep_delay,\n",
    "  taxi_out,\n",
    "  distance,\n",
    "  origin,\n",
    "  dest,\n",
    "  dep_time,\n",
    "  IF(is_train_day = 'True', False, True) AS is_eval_day\n",
    "FROM dsongcp.flights_tzcorr f\n",
    "JOIN dsongcp.trainday t\n",
    "ON f.FL_DATE = t.FL_DATE\n",
    "WHERE\n",
    "  f.CANCELLED = False AND \n",
    "  f.DIVERTED = False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Query complete after 0.00s: 100%|██████████| 3/3 [00:00<00:00, 1869.12query/s]                        \n",
      "Downloading: 100%|██████████| 1/1 [00:01<00:00,  1.33s/rows]\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>correct_cancel</th>\n",
       "      <th>total_noncancel</th>\n",
       "      <th>correct_noncancel</th>\n",
       "      <th>total_cancel</th>\n",
       "      <th>rmse</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.841209</td>\n",
       "      <td>1306019</td>\n",
       "      <td>0.965501</td>\n",
       "      <td>283750</td>\n",
       "      <td>0.204265</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   correct_cancel  total_noncancel  correct_noncancel  total_cancel      rmse\n",
       "0        0.841209          1306019           0.965501        283750  0.204265"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%bigquery\n",
    "\n",
    "WITH predictions AS (\n",
    "SELECT \n",
    "  *\n",
    "FROM ML.PREDICT(MODEL dsongcp.arr_delay_airports_timefeatures,\n",
    "                 (\n",
    "SELECT\n",
    "  IF(arr_delay < 15, 'ontime', 'late') AS ontime,\n",
    "  dep_delay,\n",
    "  taxi_out,\n",
    "  distance,\n",
    "  origin,\n",
    "  dest,\n",
    "  dep_time,\n",
    "  IF(is_train_day = 'True', False, True) AS is_eval_day\n",
    "FROM dsongcp.flights_tzcorr f\n",
    "JOIN dsongcp.trainday t\n",
    "ON f.FL_DATE = t.FL_DATE\n",
    "WHERE\n",
    "  f.CANCELLED = False AND \n",
    "  f.DIVERTED = False AND\n",
    "  t.is_train_day = 'False'\n",
    "                 ),\n",
    "                 STRUCT(0.7 AS threshold))),\n",
    "\n",
    "stats AS (\n",
    "SELECT \n",
    "  COUNTIF(ontime != 'ontime' AND ontime = predicted_ontime) AS correct_cancel\n",
    "  , COUNTIF(predicted_ontime = 'ontime') AS total_noncancel\n",
    "  , COUNTIF(ontime = 'ontime' AND ontime = predicted_ontime) AS correct_noncancel\n",
    "  , COUNTIF(ontime != 'ontime') AS total_cancel\n",
    "  , SQRT(SUM((IF(ontime = 'ontime', 1, 0) - p.prob) * (IF(ontime = 'ontime', 1, 0) - p.prob))/COUNT(*)) AS rmse\n",
    "FROM predictions, UNNEST(predicted_ontime_probs) p\n",
    "WHERE p.label = 'ontime'\n",
    ")\n",
    "\n",
    "SELECT\n",
    "   correct_cancel / total_cancel AS correct_cancel\n",
    "   , total_noncancel\n",
    "   , correct_noncancel / total_noncancel AS correct_noncancel\n",
    "   , total_cancel\n",
    "   , rmse\n",
    "FROM stats"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Day of week as categorical\n",
    "\n",
    "By default, BigQuery ML treats numbers as numeric. We want day of week to be categorical.\n",
    "We can cast it as a string, or we can treat it as weekday/weekend"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Query complete after 0.00s: 100%|██████████| 3/3 [00:00<00:00, 1811.01query/s]                        \n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: []\n",
       "Index: []"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%bigquery\n",
    "CREATE OR REPLACE MODEL dsongcp.arr_delay_airports_timefeatures\n",
    "TRANSFORM(\n",
    "   * EXCEPT(dep_time),\n",
    "   EXTRACT(hour FROM dep_time) AS dep_hour,\n",
    "   IF(EXTRACT(dayofweek FROM dep_time) BETWEEN 2 and 6, 'weekday', 'weekend') AS dep_day\n",
    ")\n",
    "OPTIONS(input_label_cols=['ontime'], \n",
    "        model_type='boosted_tree_classifier',\n",
    "        data_split_method='custom',\n",
    "        data_split_col='is_eval_day',\n",
    "        l2_reg=2.5,\n",
    "        max_tree_depth=10)\n",
    "\n",
    "AS\n",
    "\n",
    "SELECT\n",
    "  IF(arr_delay < 15, 'ontime', 'late') AS ontime,\n",
    "  dep_delay,\n",
    "  taxi_out,\n",
    "  distance,\n",
    "  origin,\n",
    "  dest,\n",
    "  dep_time,\n",
    "  IF(is_train_day = 'True', False, True) AS is_eval_day\n",
    "FROM dsongcp.flights_tzcorr f\n",
    "JOIN dsongcp.trainday t\n",
    "ON f.FL_DATE = t.FL_DATE\n",
    "WHERE\n",
    "  f.CANCELLED = False AND \n",
    "  f.DIVERTED = False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Query complete after 0.00s: 100%|██████████| 3/3 [00:00<00:00, 1529.28query/s]                        \n",
      "Downloading: 100%|██████████| 1/1 [00:00<00:00,  1.24rows/s]\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>correct_cancel</th>\n",
       "      <th>total_noncancel</th>\n",
       "      <th>correct_noncancel</th>\n",
       "      <th>total_cancel</th>\n",
       "      <th>rmse</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.840885</td>\n",
       "      <td>1306601</td>\n",
       "      <td>0.965445</td>\n",
       "      <td>283750</td>\n",
       "      <td>0.204212</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   correct_cancel  total_noncancel  correct_noncancel  total_cancel      rmse\n",
       "0        0.840885          1306601           0.965445        283750  0.204212"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%bigquery\n",
    "\n",
    "WITH predictions AS (\n",
    "SELECT \n",
    "  *\n",
    "FROM ML.PREDICT(MODEL dsongcp.arr_delay_airports_timefeatures,\n",
    "                 (\n",
    "SELECT\n",
    "  IF(arr_delay < 15, 'ontime', 'late') AS ontime,\n",
    "  dep_delay,\n",
    "  taxi_out,\n",
    "  distance,\n",
    "  origin,\n",
    "  dest,\n",
    "  dep_time,\n",
    "  IF(is_train_day = 'True', False, True) AS is_eval_day\n",
    "FROM dsongcp.flights_tzcorr f\n",
    "JOIN dsongcp.trainday t\n",
    "ON f.FL_DATE = t.FL_DATE\n",
    "WHERE\n",
    "  f.CANCELLED = False AND \n",
    "  f.DIVERTED = False AND\n",
    "  t.is_train_day = 'False'\n",
    "                 ),\n",
    "                 STRUCT(0.7 AS threshold))),\n",
    "\n",
    "stats AS (\n",
    "SELECT \n",
    "  COUNTIF(ontime != 'ontime' AND ontime = predicted_ontime) AS correct_cancel\n",
    "  , COUNTIF(predicted_ontime = 'ontime') AS total_noncancel\n",
    "  , COUNTIF(ontime = 'ontime' AND ontime = predicted_ontime) AS correct_noncancel\n",
    "  , COUNTIF(ontime != 'ontime') AS total_cancel\n",
    "  , SQRT(SUM((IF(ontime = 'ontime', 1, 0) - p.prob) * (IF(ontime = 'ontime', 1, 0) - p.prob))/COUNT(*)) AS rmse\n",
    "FROM predictions, UNNEST(predicted_ontime_probs) p\n",
    "WHERE p.label = 'ontime'\n",
    ")\n",
    "\n",
    "SELECT\n",
    "   correct_cancel / total_cancel AS correct_cancel\n",
    "   , total_noncancel\n",
    "   , correct_noncancel / total_noncancel AS correct_noncancel\n",
    "   , total_cancel\n",
    "   , rmse\n",
    "FROM stats"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature Cross\n",
    "\n",
    "Use the combination of hour and day at a specific airport.\n",
    "bucketize the hour as well so that we don't have too many hours."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Query complete after 0.00s: 100%|██████████| 3/3 [00:00<00:00, 1771.24query/s]                        \n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: []\n",
       "Index: []"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%bigquery\n",
    "CREATE OR REPLACE MODEL dsongcp.arr_delay_airports_timefeatures\n",
    "TRANSFORM(\n",
    "   * EXCEPT(dep_time),\n",
    "   ML.FEATURE_CROSS(STRUCT(\n",
    "     ML.BUCKETIZE(EXTRACT(hour FROM dep_time), [0, 4, 8, 12, 16, 20]) AS dep_hour,\n",
    "     IF(EXTRACT(dayofweek FROM dep_time) BETWEEN 2 and 6, \n",
    "                'weekday', 'weekend') AS dep_day,\n",
    "     origin\n",
    "   )) AS day_hour\n",
    ")\n",
    "OPTIONS(input_label_cols=['ontime'], \n",
    "        model_type='boosted_tree_classifier',\n",
    "        data_split_method='custom',\n",
    "        data_split_col='is_eval_day',\n",
    "        l2_reg=2.5,\n",
    "        max_tree_depth=10\n",
    "       )\n",
    "\n",
    "AS\n",
    "\n",
    "SELECT\n",
    "  IF(arr_delay < 15, 'ontime', 'late') AS ontime,\n",
    "  dep_delay,\n",
    "  taxi_out,\n",
    "  distance,\n",
    "  origin,\n",
    "  dest,\n",
    "  dep_time,\n",
    "  IF(is_train_day = 'True', False, True) AS is_eval_day\n",
    "FROM dsongcp.flights_tzcorr f\n",
    "JOIN dsongcp.trainday t\n",
    "ON f.FL_DATE = t.FL_DATE\n",
    "WHERE\n",
    "  f.CANCELLED = False AND \n",
    "  f.DIVERTED = False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Query complete after 0.00s: 100%|██████████| 3/3 [00:00<00:00, 1145.36query/s]                        \n",
      "Downloading: 100%|██████████| 1/1 [00:01<00:00,  1.06s/rows]\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>correct_cancel</th>\n",
       "      <th>total_noncancel</th>\n",
       "      <th>correct_noncancel</th>\n",
       "      <th>total_cancel</th>\n",
       "      <th>rmse</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.841008</td>\n",
       "      <td>1306231</td>\n",
       "      <td>0.965462</td>\n",
       "      <td>283750</td>\n",
       "      <td>0.204265</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   correct_cancel  total_noncancel  correct_noncancel  total_cancel      rmse\n",
       "0        0.841008          1306231           0.965462        283750  0.204265"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%bigquery\n",
    "\n",
    "WITH predictions AS (\n",
    "SELECT \n",
    "  *\n",
    "FROM ML.PREDICT(MODEL dsongcp.arr_delay_airports_timefeatures,\n",
    "                 (\n",
    "SELECT\n",
    "  IF(arr_delay < 15, 'ontime', 'late') AS ontime,\n",
    "  dep_delay,\n",
    "  taxi_out,\n",
    "  distance,\n",
    "  origin,\n",
    "  dest,\n",
    "  dep_time,\n",
    "  IF(is_train_day = 'True', False, True) AS is_eval_day\n",
    "FROM dsongcp.flights_tzcorr f\n",
    "JOIN dsongcp.trainday t\n",
    "ON f.FL_DATE = t.FL_DATE\n",
    "WHERE\n",
    "  f.CANCELLED = False AND \n",
    "  f.DIVERTED = False AND\n",
    "  t.is_train_day = 'False'\n",
    "                 ),\n",
    "                 STRUCT(0.7 AS threshold))),\n",
    "\n",
    "stats AS (\n",
    "SELECT \n",
    "  COUNTIF(ontime != 'ontime' AND ontime = predicted_ontime) AS correct_cancel\n",
    "  , COUNTIF(predicted_ontime = 'ontime') AS total_noncancel\n",
    "  , COUNTIF(ontime = 'ontime' AND ontime = predicted_ontime) AS correct_noncancel\n",
    "  , COUNTIF(ontime != 'ontime') AS total_cancel\n",
    "  , SQRT(SUM((IF(ontime = 'ontime', 1, 0) - p.prob) * (IF(ontime = 'ontime', 1, 0) - p.prob))/COUNT(*)) AS rmse\n",
    "FROM predictions, UNNEST(predicted_ontime_probs) p\n",
    "WHERE p.label = 'ontime'\n",
    ")\n",
    "\n",
    "SELECT\n",
    "   correct_cancel / total_cancel AS correct_cancel\n",
    "   , total_noncancel\n",
    "   , correct_noncancel / total_noncancel AS correct_noncancel\n",
    "   , total_cancel\n",
    "   , rmse\n",
    "FROM stats"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Copyright 2021 Google Inc. Licensed under the Apache License, Version 2.0 (the \"License\"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License."
   ]
  }
 ],
 "metadata": {
  "environment": {
   "kernel": "python3",
   "name": "managed-notebooks.m82",
   "type": "gcloud",
   "uri": "gcr.io/deeplearning-platform-release/base-cu110:latest"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
